[ Computer Vision (Read API) ] AI-OCRでFAX送信された帳票をCSV化してみました

#OpenCV

SIN

2023.07.03

この記事は公開されてから1年以上経過しています。情報が古い可能性がありますので、ご注意ください。

1 はじめに

CX 事業本部 delivery部の平内（SIN）です。

一昔前まで、OCRによるテキスト化は、誤変換が多くて、なかなか実用が難しいというイメージがあったのですが、最近のAI-OCRは、日本語や手書きのものも結構な精度で読み取れるようになっています。そして、モデルは、どんどん更新されているので、今後、ますます、精度は上がっていくでしょう。

今回は、AI-OCRを利用して、帳票をCSV化する作業を試してみました。

2 歪みの修正

FAXで受信した帳票は、やや斜めになったり、歪んでしまうことがあります。この状態では、帳票の枠組みを検出するのが難しいので、長方形になるように補正します。

修正の手順は、以下の通りです。

グレースケール変換
エッジ抽出
膨張処理
最大矩形検出
射影変換

最初にサンプルとなったFAXの画像です。

fax.png

罫線を検出しやすくするために、グレースケールに変換します。

# Convert the BGR source image to a single-channel grayscale image.
gray_image = cv2.cvtColor(org_img, cv2.COLOR_BGR2GRAY)

fax_glay.png

続いて、エッジの検出です。

# Canny edge detection on the grayscale image (thresholds 1 and 100, aperture size 3).
edges_image = cv2.Canny(gray_image, 1, 100, apertureSize=3)

fax_edges.png

エッジ検出すると、やや線が細くなってしまって、取りこぼしが発生する可能性があるので、膨張処理を施します。

# Dilate the edge image with a 2x2 rectangular kernel to thicken the thin edges.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
dilate_image = cv2.dilate(edges_image, kernel)

fax_dilate.png

膨張処理された画像から、cv2.RETR_EXTERNALで、一番外側の輪郭のみ抽出しています。

# Extract contours from the dilated image; RETR_EXTERNAL keeps only the outermost ones.
contours, hierarchy = cv2.findContours(
    dilate_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)

複数検出された場合は、すべての中で、最も大きいものを抽出し、帳票の外枠であると仮定しています。

fax_rect.png

外枠を、長方形となるように、射影で変換します。

# Compute the perspective matrix that maps the pts2 corners onto pts1, then warp
# the original image with it; areas outside the source are filled with white.
M = cv2.getPerspectiveTransform(pts2, pts1)
img = cv2.warpPerspective(
    org_img, M, (w2 + 100, h2 + 100), borderValue=(255, 255, 255)
)

こちらが、補正の終わった画像です。完全とは行きませんが、最初のものから比べると、概ね長方形に整形されていると思います。

fax_output.png

コードは、次のとおりです。

sample001.py

import cv2
import os
import numpy as np
import cv2


class ImageTool:
    """Read and write PNG images stored next to this script.

    The input image is ``<dir>/<base_name>.png`` and every output is
    ``<dir>/<base_name>_<prefix>.png``, where ``<dir>`` is the directory that
    contains this source file.
    """

    def __init__(self, base_name):
        # Anchor all paths to the script's own directory, not the current
        # working directory.
        script_path = os.path.abspath(__file__)
        self.dir = os.path.dirname(script_path)
        self.base_name = base_name

    def read(self):
        """Return the result of ``cv2.imread`` for ``<dir>/<base_name>.png``."""
        source_path = "{}/{}.png".format(self.dir, self.base_name)
        return cv2.imread(source_path)

    def write(self, prefix, img):
        """Save ``img`` to ``<dir>/<base_name>_<prefix>.png`` via ``cv2.imwrite``."""
        target_path = "{}/{}_{}.png".format(self.dir, self.base_name, prefix)
        cv2.imwrite(target_path, img)


def main():
    """Straighten the scanned form in ``fax.png`` and save every intermediate image.

    Pipeline: grayscale -> Canny edges -> dilation -> outermost contours ->
    minimum-area box of the largest contour -> perspective warp. Images are
    written through ``ImageTool`` with the suffixes ``glay``, ``edges``,
    ``dilate``, ``rect`` and ``output``.

    Raises:
        FileNotFoundError: if the input image could not be loaded.
        RuntimeError: if no contour is detected in the image.
    """
    base_name = "fax"
    image_tool = ImageTool(base_name)
    org_img = image_tool.read()
    if org_img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(
            "could not read input image '{}.png'".format(base_name)
        )

    # Grayscale conversion
    gray_image = cv2.cvtColor(org_img, cv2.COLOR_BGR2GRAY)
    image_tool.write("glay", gray_image)

    # Edge extraction
    edges_image = cv2.Canny(gray_image, 1, 100, apertureSize=3)
    image_tool.write("edges", edges_image)

    # Dilation: thicken the thin edges so the frame is less likely to break up
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    dilate_image = cv2.dilate(edges_image, kernel)
    image_tool.write("dilate", dilate_image)

    # Contour detection (outermost contours only)
    contours, _hierarchy = cv2.findContours(
        dilate_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    if len(contours) == 0:
        raise RuntimeError("no contour found; cannot locate the form's outer frame")

    # The contour with the largest area is assumed to be the form's outer frame.
    largest = max(contours, key=cv2.contourArea)
    rect_point = cv2.boxPoints(cv2.minAreaRect(largest)).astype(int)

    rect_image = org_img.copy()
    cv2.drawContours(rect_image, [rect_point], 0, (0, 0, 255), 5)
    image_tool.write("rect", rect_image)

    # Perspective transform
    # NOTE(review): the corner order returned by boxPoints and the asymmetric
    # margins below look tuned for the sample image -- confirm for other inputs.
    ((x1, y1), (x2, y2), (x3, y3), (x4, y4)) = rect_point
    margin = 100
    x1 -= margin
    x2 -= margin
    x3 += margin
    x4 += margin
    y1 += margin * 2
    y2 -= margin
    y3 -= margin
    y4 += margin * 2
    pts2 = [(x2, y2), (x1, y1), (x4, y4), (x3, y3)]

    # Output canvas size is derived from the largest x / y among the source corners.
    w2 = max(pts2, key=lambda x: x[0])[0]
    h2 = max(pts2, key=lambda x: x[1])[1]
    h, w, _ = org_img.shape
    pts1 = np.float32([(0, 0), (0, h), (w, h), (w, 0)])
    pts2 = np.float32(pts2)

    M = cv2.getPerspectiveTransform(pts2, pts1)
    img = cv2.warpPerspective(
        org_img, M, (w2 + 100, h2 + 100), borderValue=(255, 255, 255)
    )
    image_tool.write("output", img)


if __name__ == "__main__":
    main()

3 帳票の検出

帳票の枠組みを検出する手順は、以下の通りです。グレースケール変換、エッジ抽出、膨張処理については、先の手順と同じです。

グレースケール変換
エッジ抽出
膨張処理
輪郭抽出
面積によるフィルタリング（矩形検出）

グレースケール変換、エッジ抽出、膨張処理で得られた画像から、cv2.RETR_TREE で、すべての輪郭を抽出します。

# Extract contours from the dilated image; RETR_TREE retrieves all of them
# together with their parent/child hierarchy.
contours, hierarchy = cv2.findContours(
        dilate_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
)

取得したすべての輪郭から、外接する矩形を計算しますが、この時、一定の大きさでフィルタしています。このフィルタのしきい値はFAX画像の解像度や、帳票の枠組みに依存していますので、それぞれで調整が必要です。

# Filter the contours by area and collect one box per remaining contour.
rects = []
for cnt, hrchy in zip(contours, hierarchy[0]):
    if cv2.contourArea(cnt) < 3000:
        continue  # skip contours whose area is below the lower bound
    if cv2.contourArea(cnt) > 20000:
        continue  # skip contours whose area is above the upper bound
    if hrchy[3] == -1:
        continue  # skip root nodes (parent index is -1)
    # Compute the minimum-area rectangle around the contour and keep its
    # four corner points as integers.
    rect = cv2.minAreaRect(cnt)
    rect_points = cv2.boxPoints(rect).astype(int)
    rects.append(rect_points)

検出した矩形を表示すると次のようになります。

fax_output1.png

fax_output2.png

コードは、次のとおりです。

sample001.py

import cv2
import os
import numpy as np
import cv2


class ImageTool:
    """Read and write PNG images stored next to this script.

    The input image is ``<dir>/<base_name>.png`` and every output is
    ``<dir>/<base_name>_<prefix>.png``, where ``<dir>`` is the directory that
    contains this source file.
    """

    def __init__(self, base_name):
        # Anchor all paths to the script's own directory, not the current
        # working directory.
        script_path = os.path.abspath(__file__)
        self.dir = os.path.dirname(script_path)
        self.base_name = base_name

    def read(self):
        """Return the result of ``cv2.imread`` for ``<dir>/<base_name>.png``."""
        source_path = "{}/{}.png".format(self.dir, self.base_name)
        return cv2.imread(source_path)

    def write(self, prefix, img):
        """Save ``img`` to ``<dir>/<base_name>_<prefix>.png`` via ``cv2.imwrite``."""
        target_path = "{}/{}_{}.png".format(self.dir, self.base_name, prefix)
        cv2.imwrite(target_path, img)


# 矩形描画
def disp_rects(rects, img, thickness):
    """Return a copy of ``img`` with every entry of ``rects`` outlined.

    Each outline is drawn by ``cv2.drawContours`` in its own randomly chosen
    colour and with the given line ``thickness``; ``img`` is not modified.
    """
    canvas = img.copy()
    for index, _ in enumerate(rects):
        # Three random channel values in 0..254, one colour per outline.
        random_color = np.random.randint(0, 255, 3).tolist()
        cv2.drawContours(canvas, rects, index, random_color, thickness)
    return canvas


def create_white_image(org_img):
    """Return an all-white (value 255) ``uint8`` image shaped like ``org_img``.

    ``org_img.shape`` must have exactly three elements (height, width,
    channels).
    """
    height, width, channels = org_img.shape
    return np.full((height, width, channels), 255, dtype=np.uint8)


def main():
    """Detect cell-sized rectangles in ``fax.png`` and draw them.

    Pipeline: grayscale -> Canny edges -> dilation -> all contours (with
    hierarchy) -> area filter -> minimum-area boxes. The boxes are drawn on
    the original image (``output1``) and on a white canvas (``output2``).

    Raises:
        FileNotFoundError: if the input image could not be loaded.
    """
    base_name = "fax"
    image_tool = ImageTool(base_name)
    org_img = image_tool.read()
    if org_img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(
            "could not read input image '{}.png'".format(base_name)
        )
    white_img = create_white_image(org_img)

    # Grayscale conversion
    gray_image = cv2.cvtColor(org_img, cv2.COLOR_BGR2GRAY)

    # Edge extraction
    edges_image = cv2.Canny(gray_image, 1, 100, apertureSize=3)

    # Dilation
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    dilate_image = cv2.dilate(edges_image, kernel)

    # Contour extraction
    contours, hierarchy = cv2.findContours(
        dilate_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )

    # Filter by area
    rects = []
    # No hierarchy array is returned when nothing was found; treat as "no cells".
    nodes = [] if hierarchy is None else hierarchy[0]
    for cnt, hrchy in zip(contours, nodes):
        area = cv2.contourArea(cnt)
        if not 3000 <= area <= 20000:
            continue  # keep only contours whose area is plausible for a cell
        if hrchy[3] == -1:
            continue  # skip root nodes (parent index is -1)
        # Minimum-area rectangle around the contour, as four integer corners.
        rects.append(cv2.boxPoints(cv2.minAreaRect(cnt)).astype(int))

    thickness = 3
    image_tool.write("output1", disp_rects(rects, org_img, thickness))
    image_tool.write("output2", disp_rects(rects, white_img, thickness))


if __name__ == "__main__":
    main()

4 帳票の座標検出

矩形のデータは、線自体の幅や、検出の誤差により、そのまま座標として利用するのは難しいので、近似値を集約することで帳票の座標としています。

# 近似座標の集約
def consolidation(list, tolerance=3, min_count=3):
    """Collapse clusters of nearby coordinates into one representative each.

    Consecutive values whose gap is at most ``tolerance`` belong to the same
    cluster. A cluster with fewer than ``min_count`` members is discarded as
    an outlier; every other cluster contributes its midpoint,
    ``int(first + (last - first) / 2)``.

    Unlike the earlier version, a coordinate of ``0`` is handled like any
    other value, and the first cluster is held to the same ``min_count`` as
    the following ones.

    Args:
        list: coordinates sorted in ascending order. The name is kept for
            backward compatibility even though it shadows the builtin.
        tolerance: largest gap between neighbours inside one cluster.
        min_count: smallest number of members a cluster needs to be kept.

    Returns:
        The cluster midpoints as a list of ints, in ascending order.
    """
    clusters = []
    for val in list:
        if clusters and val - clusters[-1][-1] <= tolerance:
            clusters[-1].append(val)
        else:
            clusters.append([val])

    return [
        int(members[0] + (members[-1] - members[0]) / 2)
        for members in clusters
        if len(members) >= min_count
    ]


# 座標検出
def detect_point(rects):
    """Derive the grid-line X and Y coordinates from the detected rectangles.

    Collects the distinct x and y values of the first four points of every
    rectangle, sorts them, and merges near-duplicates with ``consolidation``.

    Args:
        rects: iterable of rectangles, each indexable as four ``(x, y)`` pairs.

    Returns:
        A tuple ``(x_list, y_list)`` of consolidated coordinates.
    """
    # Sets give de-duplication without a linear scan per value.
    xs = set()
    ys = set()
    for rect in rects:
        for corner in range(4):
            x, y = rect[corner]
            xs.add(x)
            ys.add(y)

    # Merge approximately equal coordinates
    x_list = consolidation(sorted(xs))
    y_list = consolidation(sorted(ys))

    return x_list, y_list

次の図は、検出した座標で罫線を引いたものです。

fax_output1.png

fax_output2.png

コードです。

sample001.py

import cv2
import os
import numpy as np
import cv2


class ImageTool:
    """Read and write PNG images stored next to this script.

    The input image is ``<dir>/<base_name>.png`` and every output is
    ``<dir>/<base_name>_<prefix>.png``, where ``<dir>`` is the directory that
    contains this source file.
    """

    def __init__(self, base_name):
        # Anchor all paths to the script's own directory, not the current
        # working directory.
        script_path = os.path.abspath(__file__)
        self.dir = os.path.dirname(script_path)
        self.base_name = base_name

    def read(self):
        """Return the result of ``cv2.imread`` for ``<dir>/<base_name>.png``."""
        source_path = "{}/{}.png".format(self.dir, self.base_name)
        return cv2.imread(source_path)

    def write(self, prefix, img):
        """Save ``img`` to ``<dir>/<base_name>_<prefix>.png`` via ``cv2.imwrite``."""
        target_path = "{}/{}_{}.png".format(self.dir, self.base_name, prefix)
        cv2.imwrite(target_path, img)


# 矩形描画
def disp_rects(rects, img, thickness):
    """Return a copy of ``img`` with every entry of ``rects`` outlined.

    Each outline is drawn by ``cv2.drawContours`` in its own randomly chosen
    colour and with the given line ``thickness``; ``img`` is not modified.
    """
    canvas = img.copy()
    for index, _ in enumerate(rects):
        # Three random channel values in 0..254, one colour per outline.
        random_color = np.random.randint(0, 255, 3).tolist()
        cv2.drawContours(canvas, rects, index, random_color, thickness)
    return canvas


def create_white_image(org_img):
    """Return an all-white (value 255) ``uint8`` image shaped like ``org_img``.

    ``org_img.shape`` must have exactly three elements (height, width,
    channels).
    """
    height, width, channels = org_img.shape
    return np.full((height, width, channels), 255, dtype=np.uint8)


# 近似座標の集約
def consolidation(list, tolerance=3, min_count=3):
    """Collapse clusters of nearby coordinates into one representative each.

    Consecutive values whose gap is at most ``tolerance`` belong to the same
    cluster. A cluster with fewer than ``min_count`` members is discarded as
    an outlier; every other cluster contributes its midpoint,
    ``int(first + (last - first) / 2)``.

    Unlike the earlier version, a coordinate of ``0`` is handled like any
    other value, and the first cluster is held to the same ``min_count`` as
    the following ones.

    Args:
        list: coordinates sorted in ascending order. The name is kept for
            backward compatibility even though it shadows the builtin.
        tolerance: largest gap between neighbours inside one cluster.
        min_count: smallest number of members a cluster needs to be kept.

    Returns:
        The cluster midpoints as a list of ints, in ascending order.
    """
    clusters = []
    for val in list:
        if clusters and val - clusters[-1][-1] <= tolerance:
            clusters[-1].append(val)
        else:
            clusters.append([val])

    return [
        int(members[0] + (members[-1] - members[0]) / 2)
        for members in clusters
        if len(members) >= min_count
    ]


# 座標検出
def detect_point(rects):
    """Derive the grid-line X and Y coordinates from the detected rectangles.

    Collects the distinct x and y values of the first four points of every
    rectangle, sorts them, and merges near-duplicates with ``consolidation``.

    Args:
        rects: iterable of rectangles, each indexable as four ``(x, y)`` pairs.

    Returns:
        A tuple ``(x_list, y_list)`` of consolidated coordinates.
    """
    # Sets give de-duplication without a linear scan per value.
    xs = set()
    ys = set()
    for rect in rects:
        for corner in range(4):
            x, y = rect[corner]
            xs.add(x)
            ys.add(y)

    # Merge approximately equal coordinates
    x_list = consolidation(sorted(xs))
    y_list = consolidation(sorted(ys))

    return x_list, y_list


# LINE描画
def disp_line(x_list, y_list, img):
    """Return a copy of ``img`` with the detected grid drawn on it.

    A vertical line is drawn at every x in ``x_list`` and a horizontal line at
    every y in ``y_list``; each line spans the extent of the other axis's
    coordinates. ``img`` itself is not modified.

    When either coordinate list is empty there is no grid to draw, so the
    unmodified copy is returned instead of failing inside ``min()``/``max()``.
    """
    image = img.copy()
    if len(x_list) == 0 or len(y_list) == 0:
        return image

    x_min, x_max = min(x_list), max(x_list)
    y_min, y_max = min(y_list), max(y_list)

    # Drawing options shared by every grid line.
    line_style = dict(color=(0, 0, 255), thickness=1, lineType=cv2.LINE_4, shift=0)

    for x in x_list:
        cv2.line(image, pt1=(x, y_min), pt2=(x, y_max), **line_style)

    for y in y_list:
        cv2.line(image, pt1=(x_min, y), pt2=(x_max, y), **line_style)

    return image


def main():
    """Detect the form's grid coordinates in ``fax.png`` and draw them as lines.

    Pipeline: grayscale -> Canny edges -> dilation -> all contours (with
    hierarchy) -> area filter -> minimum-area boxes -> consolidated x/y
    coordinates. The grid is drawn on the original image (``output1``) and on
    a white canvas (``output2``).

    Raises:
        FileNotFoundError: if the input image could not be loaded.
    """
    base_name = "fax"
    image_tool = ImageTool(base_name)
    org_img = image_tool.read()
    if org_img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(
            "could not read input image '{}.png'".format(base_name)
        )
    white_img = create_white_image(org_img)

    # Grayscale conversion
    gray_image = cv2.cvtColor(org_img, cv2.COLOR_BGR2GRAY)
    # Edge extraction
    edges_image = cv2.Canny(gray_image, 1, 100, apertureSize=3)
    # Dilation
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    dilate_image = cv2.dilate(edges_image, kernel)

    # Contour extraction
    contours, hierarchy = cv2.findContours(
        dilate_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )

    # Filter by area
    rects = []
    # No hierarchy array is returned when nothing was found; treat as "no cells".
    nodes = [] if hierarchy is None else hierarchy[0]
    for cnt, hrchy in zip(contours, nodes):
        area = cv2.contourArea(cnt)
        if not 3000 <= area <= 20000:
            continue  # keep only contours whose area is plausible for a cell
        if hrchy[3] == -1:
            continue  # skip root nodes (parent index is -1)
        # Minimum-area rectangle around the contour, as four integer corners.
        rects.append(cv2.boxPoints(cv2.minAreaRect(cnt)).astype(int))

    # Coordinate detection
    x_list, y_list = detect_point(rects)
    image_tool.write("output1", disp_line(x_list, y_list, org_img))
    image_tool.write("output2", disp_line(x_list, y_list, white_img))


if __name__ == "__main__":
    main()

5 Computer Vision 3.2 Read API

AI-OCRとしては、MicrosoftのComputer Visionで提供されている、Read APIを使用させて頂きました。

最新のモデルは、2022-04-30となっており、日本語利用では、現時点で、このRead APIが、最も精度が高いかも知れません。

参考: Computer Vision 3.2 GA Read API を呼び出す

ReadAPIのレスポンスは、下記のようになっています。

{
  "status": "succeeded",
  "createdDateTime": "2023-07-02T06:47:56Z",
  "lastUpdatedDateTime": "2023-07-02T06:47:57Z",
  "analyzeResult": {
    "version": "3.2.0",
    "modelVersion": "2022-04-30",
    "readResults": [
      {
        "page": 1,
        "angle": 0,
        "width": 2009,
        "height": 1218,
        "unit": "pixel",
        "lines": [
          {
            "boundingBox": [
              113,
              96,
              246,
              97,
              246,
              122,
              113,
              121
            ],
            "text": "資材注文情報",
            "appearance": {
              "style": {
                "name": "other",
                "confidence": 0.972
              }
            },
            "words": [
              {
                "boundingBox": [
                  119,
                  96,
                  134,
                  97,
                  133,
                  122,
                  118,
                  122
                ],
                "text": "資",
                "confidence": 0.989
              },
              {
                "boundingBox": [
                  142,
                  97,
                  157,
                  97,
                  157,
                  122,
                  141,
                  122
                ],
                "text": "材",
                "confidence": 0.965
              },
・・・略・・・

上記のレスポンスから、行（lines）としての検出された位置（boundingBox）に、赤枠を引き、その連番を記した画像と、そのテキスト出力です。

fax_output1.png

0: 資材注文情報
1: 注文ID
2: 建設会社コード
3: 建設会社名
4: 注文日時
5: 資材コード
6: 資材名
7: 数量
8: 単価
9: 合計金額
10: 注文ステータス
11: 1
12: CMP001
13: 建設A株式会社
14: 2023-06-30 12:30
15: MTL001
16: セメント
17: 100
18: 500
19: 50000
20: 処理中
・・・略・・・

コードの全体は以下の通りです。

sample001.py

import cv2
import os
import cv2
import json
import requests
import time


class ImageTool:
    """Read and write PNG images stored next to this script.

    The input image is ``<dir>/<base_name>.png`` and every output is
    ``<dir>/<base_name>_<prefix>.png``, where ``<dir>`` is the directory that
    contains this source file.
    """

    def __init__(self, base_name):
        # Anchor all paths to the script's own directory, not the current
        # working directory.
        script_path = os.path.abspath(__file__)
        self.dir = os.path.dirname(script_path)
        self.base_name = base_name

    def file_name(self):
        """Return the input image path, ``<dir>/<base_name>.png``."""
        return "{}/{}.png".format(self.dir, self.base_name)

    def read(self):
        """Return the result of ``cv2.imread`` for the input image path."""
        return cv2.imread(self.file_name())

    def write(self, prefix, img):
        """Save ``img`` to ``<dir>/<base_name>_<prefix>.png`` via ``cv2.imwrite``."""
        target_path = "{}/{}_{}.png".format(self.dir, self.base_name, prefix)
        cv2.imwrite(target_path, img)


def readApi(imFilePath, timeout=60, poll_interval=1):
    """Run the Computer Vision 3.2 Read API on an image file.

    The image bytes are posted to the ``read/analyze`` endpoint, then the URL
    from the ``Operation-Location`` response header is polled until the reply
    contains ``analyzeResult`` or reports ``status == "failed"``. Every polled
    reply is printed as indented JSON.

    Args:
        imFilePath: path of the image file to analyse.
        timeout: maximum number of seconds to keep polling.
        poll_interval: seconds to wait between two polls.

    Returns:
        The decoded JSON body of the final polling response.

    Raises:
        requests.HTTPError: if the submit or a polling request fails.
        TimeoutError: if no final result arrives within ``timeout`` seconds.
    """
    with open(imFilePath, "rb") as f:
        data = f.read()

    subscription_key = "xxxxxxxxxxxxxxxxxxxxxxxxx"
    endpoint = "https://japaneast.api.cognitive.microsoft.com/"
    model_version = "2022-04-30"
    language = "ja"

    text_recognition_url = endpoint + "vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Content-Type": "application/octet-stream",
    }
    # The key must be exactly "language": with a trailing space the parameter
    # is sent under a different name and the language hint is lost.
    params = {"language": language, "model-version": model_version}

    response = requests.post(
        text_recognition_url, headers=headers, params=params, data=data
    )
    response.raise_for_status()

    operation_url = response.headers["Operation-Location"]
    deadline = time.monotonic() + timeout

    while True:
        response_final = requests.get(operation_url, headers=headers)
        response_final.raise_for_status()
        analysis = response_final.json()

        print(json.dumps(analysis, indent=4, ensure_ascii=False))

        if "analyzeResult" in analysis or analysis.get("status") == "failed":
            return analysis
        if time.monotonic() >= deadline:
            raise TimeoutError(
                "Read API did not finish within {} seconds".format(timeout)
            )
        # Wait only when another poll is actually needed.
        time.sleep(poll_interval)


def getXY(x_list, y_list, boundingBox):
    """Find the grid cell that fully contains an OCR bounding box.

    ``boundingBox`` is a flat sequence; elements 0 and 1 are used as the
    box's left/top and elements 4 and 5 as its right/bottom. Cell ``(x, y)``
    spans ``x_list[x]..x_list[x + 1]`` horizontally and
    ``y_list[y]..y_list[y + 1]`` vertically, with one extra pixel of slack on
    the right and bottom edges.

    Returns:
        ``(x, y)`` indices of the first matching cell in row-major order, or
        ``(-1, -1)`` when the box does not fit inside any cell.
    """
    box_left, box_top = boundingBox[0], boundingBox[1]
    box_right, box_bottom = boundingBox[4], boundingBox[5]

    for row, (top, next_y) in enumerate(zip(y_list, y_list[1:])):
        if box_top < top or box_bottom > next_y + 1:
            continue  # the box does not fit this row vertically
        for col, (left, next_x) in enumerate(zip(x_list, x_list[1:])):
            if left <= box_left and box_right <= next_x + 1:
                return col, row
    return -1, -1


def main():
    """OCR ``fax.png`` with the Read API and visualise every detected line.

    The raw API response is saved to ``output_read3.2.json``. Each detected
    line is framed and numbered on a copy of the image (written as
    ``output1``) and listed on stdout as ``<index>: <text>``.

    Raises:
        FileNotFoundError: if the input image could not be loaded.
        RuntimeError: if the API response contains no ``analyzeResult``.
    """
    base_name = "fax"
    image_tool = ImageTool(base_name)
    org_img = image_tool.read()
    if org_img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail before spending an API call.
        raise FileNotFoundError(
            "could not read input image '{}.png'".format(base_name)
        )

    # OCR with the Computer Vision 3.2 Read API
    response = readApi(image_tool.file_name())
    # Save the response as JSON. The built-in open() is used because this
    # script does not import `codecs`.
    with open("output_read3.2.json", "w", encoding="utf-8") as fp:
        json.dump(response, fp, ensure_ascii=False, indent=2)

    if "analyzeResult" not in response:
        raise RuntimeError(
            "Read API returned no result (status: {})".format(response.get("status"))
        )

    output_img = org_img.copy()
    readResult = response["analyzeResult"]["readResults"][0]
    lines = readResult["lines"]

    numbered = []
    for i, line in enumerate(lines):
        text = line["text"]
        p = line["boundingBox"]

        # Frame the line: x, y, width, height taken from the box's first and
        # third points.
        cv2.rectangle(
            output_img, [p[0], p[1], (p[4] - p[0]), (p[5] - p[1])], (0, 0, 255), 1
        )
        # Put the running index next to the frame.
        cv2.putText(
            output_img,
            str(i),
            (p[0] - 10, p[1]),
            cv2.FONT_HERSHEY_PLAIN,
            1,
            (0, 0, 255),
            1,
            cv2.LINE_AA,
        )

        numbered.append("{}: {}\n".format(i, text))

    print("".join(numbered))
    image_tool.write("output1", output_img)


if __name__ == "__main__":
    main()

6 CSV出力

ReadAPIのレスポンスには、検出位置（boundingBox）が含まれるので、既に検出できている帳票の座標と突き合わせることで、検出された文字列が、どのセルに該当するかが分かります。

def getXY(x_list, y_list, boundingBox):
    """Find the grid cell that fully contains an OCR bounding box.

    ``boundingBox`` is a flat sequence; elements 0 and 1 are used as the
    box's left/top and elements 4 and 5 as its right/bottom. Cell ``(x, y)``
    spans ``x_list[x]..x_list[x + 1]`` horizontally and
    ``y_list[y]..y_list[y + 1]`` vertically, with one extra pixel of slack on
    the right and bottom edges.

    Returns:
        ``(x, y)`` indices of the first matching cell in row-major order, or
        ``(-1, -1)`` when the box does not fit inside any cell.
    """
    box_left, box_top = boundingBox[0], boundingBox[1]
    box_right, box_bottom = boundingBox[4], boundingBox[5]

    for row, (top, next_y) in enumerate(zip(y_list, y_list[1:])):
        if box_top < top or box_bottom > next_y + 1:
            continue  # the box does not fit this row vertically
        for col, (left, next_x) in enumerate(zip(x_list, x_list[1:])):
            if left <= box_left and box_right <= next_x + 1:
                return col, row
    return -1, -1

各セルへの挿入を行い、列ごとに、CSVとして出力したものです。

注文ID,建設会社コード,建設会社名,注文日時,資材コード,資材名,数量,単価,合計金額,注文ステータス,,
1,CMP001,建設A株式会社,2023-06-30 12:30,MTL001,セメント,100,500,50000,処理中,,
,,建設B株式会社,2023-06-30 13:00,MTL002,鉄筋,50,20,10000,出荷済み,,
3,CMP003,建設C株式会社,2023-06-30 13:30,MTL003,砂利,200,300,60000,処理中,,
4,CMP004,建設D株式会社,2023-06-30 14:00,MTL004,コンクリートブロッ,25,800,20000,キャンセル,,
,,建設E株式会社,2023-06-30 15:00-,MTL005,木材,100,1000,100000,出荷済み,,
6,CMP001,建設A株式会社,2023-06-30 15:30,MTL006,塗料,30,2000,60000,処理中,,
,,建設B株式会社,2023-06-30 16:00,MTL007,ネジ,500,10,5000,出荷済み,,
8,CMP003,建設C株式会社,2023-06-30 16:30,MTL008,釘,1000,5,5000,処理中,,
・・・略・・・

見やすいようにExcelで読み込んでみました。完璧ではないですが、結構正確に帳票が再現できているように思います。

コードです。

sample001.py

import cv2
import os
import numpy as np
import cv2
import json
import requests
import time
import codecs


class ImageTool:
    """Read and write PNG images stored next to this script.

    The input image is ``<dir>/<base_name>.png`` and every output is
    ``<dir>/<base_name>_<prefix>.png``, where ``<dir>`` is the directory that
    contains this source file.
    """

    def __init__(self, base_name):
        # Anchor all paths to the script's own directory, not the current
        # working directory.
        script_path = os.path.abspath(__file__)
        self.dir = os.path.dirname(script_path)
        self.base_name = base_name

    def file_name(self):
        """Return the input image path, ``<dir>/<base_name>.png``."""
        return "{}/{}.png".format(self.dir, self.base_name)

    def read(self):
        """Return the result of ``cv2.imread`` for the input image path."""
        return cv2.imread(self.file_name())

    def write(self, prefix, img):
        """Save ``img`` to ``<dir>/<base_name>_<prefix>.png`` via ``cv2.imwrite``."""
        target_path = "{}/{}_{}.png".format(self.dir, self.base_name, prefix)
        cv2.imwrite(target_path, img)


# 矩形描画
def disp_rects(rects, img, thickness):
    """Return a copy of ``img`` with every entry of ``rects`` outlined.

    Each outline is drawn by ``cv2.drawContours`` in its own randomly chosen
    colour and with the given line ``thickness``; ``img`` is not modified.
    """
    canvas = img.copy()
    for index, _ in enumerate(rects):
        # Three random channel values in 0..254, one colour per outline.
        random_color = np.random.randint(0, 255, 3).tolist()
        cv2.drawContours(canvas, rects, index, random_color, thickness)
    return canvas


def create_white_image(org_img):
    """Return an all-white (value 255) ``uint8`` image shaped like ``org_img``.

    ``org_img.shape`` must have exactly three elements (height, width,
    channels).
    """
    height, width, channels = org_img.shape
    return np.full((height, width, channels), 255, dtype=np.uint8)


# 近似座標の集約
def consolidation(list, tolerance=3, min_count=3):
    """Collapse clusters of nearby coordinates into one representative each.

    Consecutive values whose gap is at most ``tolerance`` belong to the same
    cluster. A cluster with fewer than ``min_count`` members is discarded as
    an outlier; every other cluster contributes its midpoint,
    ``int(first + (last - first) / 2)``.

    Unlike the earlier version, a coordinate of ``0`` is handled like any
    other value, and the first cluster is held to the same ``min_count`` as
    the following ones.

    Args:
        list: coordinates sorted in ascending order. The name is kept for
            backward compatibility even though it shadows the builtin.
        tolerance: largest gap between neighbours inside one cluster.
        min_count: smallest number of members a cluster needs to be kept.

    Returns:
        The cluster midpoints as a list of ints, in ascending order.
    """
    clusters = []
    for val in list:
        if clusters and val - clusters[-1][-1] <= tolerance:
            clusters[-1].append(val)
        else:
            clusters.append([val])

    return [
        int(members[0] + (members[-1] - members[0]) / 2)
        for members in clusters
        if len(members) >= min_count
    ]


# 座標検出
def detect_point(rects):
    """Derive the grid-line X and Y coordinates from the detected rectangles.

    Collects the distinct x and y values of the first four points of every
    rectangle, sorts them, and merges near-duplicates with ``consolidation``.

    Args:
        rects: iterable of rectangles, each indexable as four ``(x, y)`` pairs.

    Returns:
        A tuple ``(x_list, y_list)`` of consolidated coordinates.
    """
    # Sets give de-duplication without a linear scan per value.
    xs = set()
    ys = set()
    for rect in rects:
        for corner in range(4):
            x, y = rect[corner]
            xs.add(x)
            ys.add(y)

    # Merge approximately equal coordinates
    x_list = consolidation(sorted(xs))
    y_list = consolidation(sorted(ys))

    return x_list, y_list


# LINE描画
def disp_line(x_list, y_list, img):
    """Return a copy of ``img`` with the detected grid drawn on it.

    A vertical line is drawn at every x in ``x_list`` and a horizontal line at
    every y in ``y_list``; each line spans the extent of the other axis's
    coordinates. ``img`` itself is not modified.

    When either coordinate list is empty there is no grid to draw, so the
    unmodified copy is returned instead of failing inside ``min()``/``max()``.
    """
    image = img.copy()
    if len(x_list) == 0 or len(y_list) == 0:
        return image

    x_min, x_max = min(x_list), max(x_list)
    y_min, y_max = min(y_list), max(y_list)

    # Drawing options shared by every grid line.
    line_style = dict(color=(0, 0, 255), thickness=1, lineType=cv2.LINE_4, shift=0)

    for x in x_list:
        cv2.line(image, pt1=(x, y_min), pt2=(x, y_max), **line_style)

    for y in y_list:
        cv2.line(image, pt1=(x_min, y), pt2=(x_max, y), **line_style)

    return image


def readApi(imFilePath, timeout=60, poll_interval=1):
    """Run the Computer Vision 3.2 Read API on an image file.

    The image bytes are posted to the ``read/analyze`` endpoint, then the URL
    from the ``Operation-Location`` response header is polled until the reply
    contains ``analyzeResult`` or reports ``status == "failed"``. Every polled
    reply is printed as indented JSON.

    Args:
        imFilePath: path of the image file to analyse.
        timeout: maximum number of seconds to keep polling.
        poll_interval: seconds to wait between two polls.

    Returns:
        The decoded JSON body of the final polling response.

    Raises:
        requests.HTTPError: if the submit or a polling request fails.
        TimeoutError: if no final result arrives within ``timeout`` seconds.
    """
    with open(imFilePath, "rb") as f:
        data = f.read()

    subscription_key = "xxxxxxxxxxxxxxxxxxxxxxxxx"
    endpoint = "https://japaneast.api.cognitive.microsoft.com/"
    model_version = "2022-04-30"
    language = "ja"

    text_recognition_url = endpoint + "vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Content-Type": "application/octet-stream",
    }
    # The key must be exactly "language": with a trailing space the parameter
    # is sent under a different name and the language hint is lost.
    params = {"language": language, "model-version": model_version}

    response = requests.post(
        text_recognition_url, headers=headers, params=params, data=data
    )
    response.raise_for_status()

    operation_url = response.headers["Operation-Location"]
    deadline = time.monotonic() + timeout

    while True:
        response_final = requests.get(operation_url, headers=headers)
        response_final.raise_for_status()
        analysis = response_final.json()

        print(json.dumps(analysis, indent=4, ensure_ascii=False))

        if "analyzeResult" in analysis or analysis.get("status") == "failed":
            return analysis
        if time.monotonic() >= deadline:
            raise TimeoutError(
                "Read API did not finish within {} seconds".format(timeout)
            )
        # Wait only when another poll is actually needed.
        time.sleep(poll_interval)


def getXY(x_list, y_list, boundingBox):
    """Find the grid cell that fully contains an OCR bounding box.

    ``boundingBox`` is a flat sequence; elements 0 and 1 are used as the
    box's left/top and elements 4 and 5 as its right/bottom. Cell ``(x, y)``
    spans ``x_list[x]..x_list[x + 1]`` horizontally and
    ``y_list[y]..y_list[y + 1]`` vertically, with one extra pixel of slack on
    the right and bottom edges.

    Returns:
        ``(x, y)`` indices of the first matching cell in row-major order, or
        ``(-1, -1)`` when the box does not fit inside any cell.
    """
    box_left, box_top = boundingBox[0], boundingBox[1]
    box_right, box_bottom = boundingBox[4], boundingBox[5]

    for row, (top, next_y) in enumerate(zip(y_list, y_list[1:])):
        if box_top < top or box_bottom > next_y + 1:
            continue  # the box does not fit this row vertically
        for col, (left, next_x) in enumerate(zip(x_list, x_list[1:])):
            if left <= box_left and box_right <= next_x + 1:
                return col, row
    return -1, -1


def main():
    """Detect the form grid in ``fax.png``, OCR it, and write ``output.csv``.

    The grid coordinates come from the contour pipeline (grayscale -> Canny
    -> dilation -> contours -> area filter -> consolidated coordinates). Each
    OCR line whose bounding box fits inside a grid cell is stored in that
    cell; lines that fit no cell are printed with a ``>>`` prefix. The raw
    API response is saved to ``output_read3.2.json``.

    Raises:
        FileNotFoundError: if the input image could not be loaded.
        RuntimeError: if the API response contains no ``analyzeResult``.
    """
    # Imported locally because the local name `csv` previously shadowed it.
    import csv as csv_module

    base_name = "fax"
    image_tool = ImageTool(base_name)
    org_img = image_tool.read()
    if org_img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(
            "could not read input image '{}.png'".format(base_name)
        )

    # Grayscale conversion
    gray_image = cv2.cvtColor(org_img, cv2.COLOR_BGR2GRAY)
    # Edge extraction
    edges_image = cv2.Canny(gray_image, 1, 100, apertureSize=3)
    # Dilation
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    dilate_image = cv2.dilate(edges_image, kernel)

    # Contour extraction
    contours, hierarchy = cv2.findContours(
        dilate_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )

    # Filter by area
    rects = []
    # No hierarchy array is returned when nothing was found; treat as "no cells".
    nodes = [] if hierarchy is None else hierarchy[0]
    for cnt, hrchy in zip(contours, nodes):
        area = cv2.contourArea(cnt)
        if not 3000 <= area <= 20000:
            continue  # keep only contours whose area is plausible for a cell
        if hrchy[3] == -1:
            continue  # skip root nodes (parent index is -1)
        # Minimum-area rectangle around the contour, as four integer corners.
        rects.append(cv2.boxPoints(cv2.minAreaRect(cnt)).astype(int))

    # Coordinate detection
    x_list, y_list = detect_point(rects)

    # OCR with the Computer Vision 3.2 Read API
    response = readApi(image_tool.file_name())
    # Save the response as JSON
    with codecs.open("output_read3.2.json", "w+", "utf-8") as fp:
        json.dump(response, fp, ensure_ascii=False, indent=2)

    if "analyzeResult" not in response:
        raise RuntimeError(
            "Read API returned no result (status: {})".format(response.get("status"))
        )

    # Output buffer: one row per y coordinate, one empty cell per x coordinate
    table = [[""] * len(x_list) for _ in y_list]

    # Look up each bounding box against the grid lines and fill the table
    readResult = response["analyzeResult"]["readResults"][0]
    for line in readResult["lines"]:
        text = line["text"]
        boundingBox = line["boundingBox"]
        x, y = getXY(x_list, y_list, boundingBox)
        if x == -1:
            print(">> {} {}".format(text, boundingBox))
        else:
            print("[{},{}] {}".format(x, y, text))
            table[y][x] = text

    # CSV output. csv.writer quotes cells containing commas or quotes, which
    # plain string concatenation would turn into extra columns. The appended
    # empty cell keeps the trailing comma of the previous output format.
    with open("output.csv", mode="w") as f:
        writer = csv_module.writer(f, lineterminator="\n")
        for row in table:
            writer.writerow(row + [""])

    
if __name__ == "__main__":
    main()

7 修正

Excelなどに展開すると分かりやすいですが、読み込んだ結果は、一部のセルが欠落してしまっています。

実は、この原因は、検出範囲が、セルをまたがってしまっていて、テキストは検出されているが、帳票座標との突き合わせに失敗したものです。

また、最初から帳票の外に配置されたテキストも存在します。このように、いくつかの状況によっては、完全な自動化は難しく、手動での補正が必要となると思います。

そこで、手動で補正することを前提にして、帳票として処理できなかったものを出力してみました。

fax_output1.png

(1) 資材注文情報
(2) 2 CMP002
(3) 5 CMP005
(4) 7 CMP002
(5) 2023-06-30 22:30 MTL020
(6) 前回と同様の注文です。

CSV化すると同時に、このような出力を準備すると、手動による修正も捗るのではないかと思います。

コードです。

sample001.py

import cv2
import os
import numpy as np
import cv2
import json
import requests
import time

class ImageTool:
    """Read and write PNG images stored next to this script.

    The input image is ``<dir>/<base_name>.png`` and every output is
    ``<dir>/<base_name>_<prefix>.png``, where ``<dir>`` is the directory that
    contains this source file.
    """

    def __init__(self, base_name):
        # Anchor all paths to the script's own directory, not the current
        # working directory.
        script_path = os.path.abspath(__file__)
        self.dir = os.path.dirname(script_path)
        self.base_name = base_name

    def file_name(self):
        """Return the input image path, ``<dir>/<base_name>.png``."""
        return "{}/{}.png".format(self.dir, self.base_name)

    def read(self):
        """Return the result of ``cv2.imread`` for the input image path."""
        return cv2.imread(self.file_name())

    def write(self, prefix, img):
        """Save ``img`` to ``<dir>/<base_name>_<prefix>.png`` via ``cv2.imwrite``."""
        target_path = "{}/{}_{}.png".format(self.dir, self.base_name, prefix)
        cv2.imwrite(target_path, img)


# 矩形描画
def disp_rects(rects, img, thickness):
    """Return a copy of ``img`` with every entry of ``rects`` outlined.

    Each outline is drawn by ``cv2.drawContours`` in its own randomly chosen
    colour and with the given line ``thickness``; ``img`` is not modified.
    """
    canvas = img.copy()
    for index, _ in enumerate(rects):
        # Three random channel values in 0..254, one colour per outline.
        random_color = np.random.randint(0, 255, 3).tolist()
        cv2.drawContours(canvas, rects, index, random_color, thickness)
    return canvas


def create_white_image(org_img):
    """Return an all-white (value 255) ``uint8`` image shaped like ``org_img``.

    ``org_img.shape`` must have exactly three elements (height, width,
    channels).
    """
    height, width, channels = org_img.shape
    return np.full((height, width, channels), 255, dtype=np.uint8)


# 近似座標の集約
def consolidation(list, tolerance=3, min_count=3):
    """Collapse clusters of nearby coordinates into one representative each.

    Consecutive values whose gap is at most ``tolerance`` belong to the same
    cluster. A cluster with fewer than ``min_count`` members is discarded as
    an outlier; every other cluster contributes its midpoint,
    ``int(first + (last - first) / 2)``.

    Unlike the earlier version, a coordinate of ``0`` is handled like any
    other value, and the first cluster is held to the same ``min_count`` as
    the following ones.

    Args:
        list: coordinates sorted in ascending order. The name is kept for
            backward compatibility even though it shadows the builtin.
        tolerance: largest gap between neighbours inside one cluster.
        min_count: smallest number of members a cluster needs to be kept.

    Returns:
        The cluster midpoints as a list of ints, in ascending order.
    """
    clusters = []
    for val in list:
        if clusters and val - clusters[-1][-1] <= tolerance:
            clusters[-1].append(val)
        else:
            clusters.append([val])

    return [
        int(members[0] + (members[-1] - members[0]) / 2)
        for members in clusters
        if len(members) >= min_count
    ]


# 座標検出
def detect_point(rects):
    """Derive the grid-line X and Y coordinates from the detected rectangles.

    Collects the distinct x and y values of the first four points of every
    rectangle, sorts them, and merges near-duplicates with ``consolidation``.

    Args:
        rects: iterable of rectangles, each indexable as four ``(x, y)`` pairs.

    Returns:
        A tuple ``(x_list, y_list)`` of consolidated coordinates.
    """
    # Sets give de-duplication without a linear scan per value.
    xs = set()
    ys = set()
    for rect in rects:
        for corner in range(4):
            x, y = rect[corner]
            xs.add(x)
            ys.add(y)

    # Merge approximately equal coordinates
    x_list = consolidation(sorted(xs))
    y_list = consolidation(sorted(ys))

    return x_list, y_list


# LINE描画
def disp_line(x_list, y_list, img):
    """Return a copy of ``img`` with the detected grid drawn on it.

    A vertical line is drawn at every x in ``x_list`` and a horizontal line at
    every y in ``y_list``; each line spans the extent of the other axis's
    coordinates. ``img`` itself is not modified.

    When either coordinate list is empty there is no grid to draw, so the
    unmodified copy is returned instead of failing inside ``min()``/``max()``.
    """
    image = img.copy()
    if len(x_list) == 0 or len(y_list) == 0:
        return image

    x_min, x_max = min(x_list), max(x_list)
    y_min, y_max = min(y_list), max(y_list)

    # Drawing options shared by every grid line.
    line_style = dict(color=(0, 0, 255), thickness=1, lineType=cv2.LINE_4, shift=0)

    for x in x_list:
        cv2.line(image, pt1=(x, y_min), pt2=(x, y_max), **line_style)

    for y in y_list:
        cv2.line(image, pt1=(x_min, y), pt2=(x_max, y), **line_style)

    return image


def readApi(imFilePath, timeout=60, poll_interval=1):
    """Run the Computer Vision 3.2 Read API on an image file.

    The image bytes are posted to the ``read/analyze`` endpoint, then the URL
    from the ``Operation-Location`` response header is polled until the reply
    contains ``analyzeResult`` or reports ``status == "failed"``. Every polled
    reply is printed as indented JSON.

    Args:
        imFilePath: path of the image file to analyse.
        timeout: maximum number of seconds to keep polling.
        poll_interval: seconds to wait between two polls.

    Returns:
        The decoded JSON body of the final polling response.

    Raises:
        requests.HTTPError: if the submit or a polling request fails.
        TimeoutError: if no final result arrives within ``timeout`` seconds.
    """
    with open(imFilePath, "rb") as f:
        data = f.read()

    subscription_key = "xxxxxxxxxxxxxxxxxxxxxxxxx"
    endpoint = "https://japaneast.api.cognitive.microsoft.com/"
    model_version = "2022-04-30"
    language = "ja"

    text_recognition_url = endpoint + "vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Content-Type": "application/octet-stream",
    }
    # The key must be exactly "language": with a trailing space the parameter
    # is sent under a different name and the language hint is lost.
    params = {"language": language, "model-version": model_version}

    response = requests.post(
        text_recognition_url, headers=headers, params=params, data=data
    )
    response.raise_for_status()

    operation_url = response.headers["Operation-Location"]
    deadline = time.monotonic() + timeout

    while True:
        response_final = requests.get(operation_url, headers=headers)
        response_final.raise_for_status()
        analysis = response_final.json()

        print(json.dumps(analysis, indent=4, ensure_ascii=False))

        if "analyzeResult" in analysis or analysis.get("status") == "failed":
            return analysis
        if time.monotonic() >= deadline:
            raise TimeoutError(
                "Read API did not finish within {} seconds".format(timeout)
            )
        # Wait only when another poll is actually needed.
        time.sleep(poll_interval)


def getXY(x_list, y_list, boundingBox):
    """Find the grid cell that fully contains an OCR bounding box.

    ``boundingBox`` is a flat sequence; elements 0 and 1 are used as the
    box's left/top and elements 4 and 5 as its right/bottom. Cell ``(x, y)``
    spans ``x_list[x]..x_list[x + 1]`` horizontally and
    ``y_list[y]..y_list[y + 1]`` vertically, with one extra pixel of slack on
    the right and bottom edges.

    Returns:
        ``(x, y)`` indices of the first matching cell in row-major order, or
        ``(-1, -1)`` when the box does not fit inside any cell.
    """
    box_left, box_top = boundingBox[0], boundingBox[1]
    box_right, box_bottom = boundingBox[4], boundingBox[5]

    for row, (top, next_y) in enumerate(zip(y_list, y_list[1:])):
        if box_top < top or box_bottom > next_y + 1:
            continue  # the box does not fit this row vertically
        for col, (left, next_x) in enumerate(zip(x_list, x_list[1:])):
            if left <= box_left and box_right <= next_x + 1:
                return col, row
    return -1, -1


def main():
    """Convert ``fax.png`` to ``output.csv`` and report text that fits no cell.

    The grid coordinates come from the contour pipeline (grayscale -> Canny
    -> dilation -> contours -> area filter -> consolidated coordinates). Each
    OCR line whose bounding box fits inside a grid cell is stored in that
    cell. Every other line is framed and numbered on a copy of the image
    (written as ``output1``) and listed on stdout as ``(<n>) <text>`` so it
    can be corrected by hand. The raw API response is saved to
    ``output_read3.2.json``.

    Raises:
        FileNotFoundError: if the input image could not be loaded.
        RuntimeError: if the API response contains no ``analyzeResult``.
    """
    # Imported locally because the local name `csv` previously shadowed it.
    import csv as csv_module

    base_name = "fax"
    image_tool = ImageTool(base_name)
    org_img = image_tool.read()
    if org_img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(
            "could not read input image '{}.png'".format(base_name)
        )

    # Grayscale conversion
    gray_image = cv2.cvtColor(org_img, cv2.COLOR_BGR2GRAY)
    # Edge extraction
    edges_image = cv2.Canny(gray_image, 1, 100, apertureSize=3)
    # Dilation
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    dilate_image = cv2.dilate(edges_image, kernel)

    # Contour extraction
    contours, hierarchy = cv2.findContours(
        dilate_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )

    # Filter by area
    rects = []
    # No hierarchy array is returned when nothing was found; treat as "no cells".
    nodes = [] if hierarchy is None else hierarchy[0]
    for cnt, hrchy in zip(contours, nodes):
        area = cv2.contourArea(cnt)
        if not 3000 <= area <= 20000:
            continue  # keep only contours whose area is plausible for a cell
        if hrchy[3] == -1:
            continue  # skip root nodes (parent index is -1)
        # Minimum-area rectangle around the contour, as four integer corners.
        rects.append(cv2.boxPoints(cv2.minAreaRect(cnt)).astype(int))

    # Coordinate detection
    x_list, y_list = detect_point(rects)

    # OCR with the Computer Vision 3.2 Read API
    response = readApi(image_tool.file_name())
    # Save the response as JSON. The built-in open() is used because this
    # script does not import `codecs`.
    with open("output_read3.2.json", "w", encoding="utf-8") as fp:
        json.dump(response, fp, ensure_ascii=False, indent=2)

    if "analyzeResult" not in response:
        raise RuntimeError(
            "Read API returned no result (status: {})".format(response.get("status"))
        )

    # Output buffer: one row per y coordinate, one empty cell per x coordinate
    table = [[""] * len(x_list) for _ in y_list]

    outline_entries = []
    outline_counter = 1
    output_img = org_img.copy()

    # Look up each bounding box against the grid lines and fill the table
    readResult = response["analyzeResult"]["readResults"][0]
    for line in readResult["lines"]:
        text = line["text"]
        boundingBox = line["boundingBox"]
        x, y = getXY(x_list, y_list, boundingBox)
        if x == -1:
            # Text that fits no cell: frame it and tag it with a running number.
            cv2.rectangle(
                output_img,
                [
                    boundingBox[0],
                    boundingBox[1],
                    (boundingBox[4] - boundingBox[0]),
                    (boundingBox[5] - boundingBox[1]),
                ],
                (0, 0, 255),
                2,
            )
            cv2.putText(
                output_img,
                "({})".format(outline_counter),
                (boundingBox[0] - 50, boundingBox[1] + 20),
                cv2.FONT_HERSHEY_PLAIN,
                2,
                (0, 0, 255),
                2,
                cv2.LINE_AA,
            )
            outline_entries.append("({}) {}\n".format(outline_counter, text))
            outline_counter += 1
        else:
            table[y][x] = text

    # Output of the text outside the form
    image_tool.write("output1", output_img)
    print("".join(outline_entries))

    # CSV output. csv.writer quotes cells containing commas or quotes, which
    # plain string concatenation would turn into extra columns. The appended
    # empty cell keeps the trailing comma of the previous output format.
    with open("output.csv", mode="w") as f:
        writer = csv_module.writer(f, lineterminator="\n")
        for row in table:
            writer.writerow(row + [""])


if __name__ == "__main__":
    main()